This notebook provides a walkthrough for performing sentiment analysis on SEC 10-K reports for a chosen stock. We display two sets of results: the stock's sentiment compared with its own filings from the previous 5 years, and the stock's sentiment relative to its closest competitors.
Import the desired libraries to be used
from collections import Counter
from sec_edgar_downloader import Downloader
import os
import fnmatch
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import yfinance as yf
import numpy as np
import torch
import scipy
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from IPython.core.debugger import set_trace
from bs4 import BeautifulSoup
Download the 'punkt' dataset, a pre-trained model that nltk's sent_tokenize uses to split raw text into a list of English sentences
# Fetch the 'punkt' tokenizer models required by sent_tokenize/word_tokenize below
# (a no-op if they are already cached locally)
nltk.download('punkt')
[nltk_data] Downloading package punkt to [nltk_data] /Users/advaitkumar/nltk_data... [nltk_data] Package punkt is already up-to-date!
True
##### Inputs #####
ticker = 'MS'  # ticker symbol to analyse
year = 2014    # filing year of the 10-K report
### Create tokenizers which will convert the input text (strings) to the relevant torch.Tensor to feed into the model
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert") ## Used for sentiment analysis (positive/negative/neutral)
tokenizer_fls = BertTokenizer.from_pretrained("yiyanghkust/finbert-fls") ## Used for analysing the text to search for forward looking statements (FLS)
#### Initialise the NLP Models to be used for prediction
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert") ## Used for sentiment analysis (positive/negative/neutral)
model_fls = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-fls") ## Used for analysing the text to search for forward looking statements (FLS)
#### Initialise the dicts to store data for plots
competitors_dict = {}  # {ticker: sentiment percentages} for the nearest competitors
historical_dict = {}   # {year: sentiment percentages} for the chosen ticker
The checker function (defined in utils) checks whether the EDGAR filing is already present in the directory; if it is found, it returns an open file handle, otherwise it returns None.
def checker(directory, year):
    """Return an open handle to a cached EDGAR 10-K filing, or None.

    Searches `directory` for a sub-folder whose EDGAR accession number
    contains the filing year's last two digits (folders are named like
    '0001193125-14-070737').

    Inputs :
        directory (str) : The directory to check for the file
        year (int) : The year for the SEC 10K report
    Outputs :
        An open file object for 'full-submission.txt' inside the matching
        folder, or None if the directory or filing does not exist. The
        caller is responsible for closing the returned file.
    """
    if not os.path.exists(directory):
        return None  ### Directory doesn't exist => report was never downloaded
    # EDGAR accession numbers always carry TWO year digits, so zero-pad:
    # str(year % 100) would yield '5' instead of '05' for 2005 and the
    # pattern '*-5-*' would never match '....-05-....'.
    pattern = '*-' + '{:02d}'.format(year % 100) + '-*'
    for filename in os.listdir(directory):
        if fnmatch.fnmatch(filename, pattern):  ## If match found, open it and return the file
            return open(os.path.join(directory, filename, 'full-submission.txt'))
    return None  ## If no match found return None
The get_competitors function in utils extracts the nearest competitors for the current ticker based on the nearest market caps of all the stocks of the same industry as the ticker
def get_competitors(sheet_name, num_neighbours, ticker):
    """Return the tickers closest in market cap within the ticker's industry.

    Input:
        sheet_name (str) : Name of the .csv file which has the symbol data
            (must contain 'Symbol', 'Industry' and 'Market Cap' columns)
        num_neighbours (int) : Number of competitors that need to be loaded
        ticker (str) : The current ticker for which competitors need to be found
    Output:
        temp (list[str]) : Tickers of the nearest `num_neighbours` competitors
            by absolute market-cap difference. The ticker itself has
            difference 0 and is therefore always included first.
    """
    df = pd.read_csv(sheet_name)
    x = df[(df.Symbol == ticker)]  ## get data for current ticker
    # .copy() makes an independent frame so the column assignments below do
    # not raise pandas' SettingWithCopyWarning on a view of `df`
    temp = df[df.Industry == x.Industry.iloc[0]].copy()  ## get all the stocks of the same industry
    ## Get the closest (num_neighbours) number of stocks according to the absolute difference in market cap
    temp['Market Cap'] = temp['Market Cap'].astype(float).astype(int)
    temp['cap_diff'] = np.abs(temp['Market Cap'] - int(float(x['Market Cap'].iloc[0])))
    temp = temp.sort_values('cap_diff').Symbol.head(num_neighbours).tolist()  ## convert to list
    return temp
The get_section_location function present in the section_location file extracts the index locations of all the relevant sections in the text and returns a dataframe consisting of the starting index and the ending index of all the sections
def get_section_location(text):
    """
    Locate the start/end character offsets of the relevant 10-K items.

    Input :
        text (str) : The text from which the location of sections need to be extracted
    Output :
        text (str) : The input text, returned unchanged
        test_df (pd.DataFrame()) : A dataframe consisting of all the start locations and ending locations of the relevant sections (items), indexed by normalised item label (e.g. 'item1a')
    """
    # NOTE(review): some alternatives inside the groups below render as plain
    # blanks here — they were presumably HTML whitespace entities (e.g. &#160;)
    # in the original notebook source; confirm before editing the pattern.
    regex = re.compile(r'(>Item(\s| | | )(1\.|1A|1B|1C|2|3|4|5|6|7|7A|8)\.{0,1})|(>ITEM(\s| | | )(1\.|1A|1B|1C|2|3|4|5|6|7|7A|8))') ### Check for the relevant patterns in the text (signifying the start and end of the item)
    matches = regex.finditer(text) ## Get all the matches for the patterns
    ### Create a dataframe with the name, start and end index of each item
    test_df = pd.DataFrame([[x.group(), x.start(), x.end()] for x in matches])
    test_df.columns = ['group', 'start_idx', 'end_idx']
    test_df.group = test_df.group.str.lower()
    #### Normalise the matched labels (strip whitespace/entities, dots and '>')
    #### so duplicate occurrences of the same item collapse to one key
    test_df.replace(' ',' ',regex=True,inplace=True)
    test_df.replace(' ',' ',regex=True,inplace=True)
    test_df.replace(' ',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)
    #### Drop the duplicates and only keep the last occurrence, which contains
    #### the actual section text (earlier hits come from the table of contents)
    test_df = test_df.drop_duplicates('group', keep = 'last')
    test_df.index = range(len(test_df))
    test_df = test_df.set_index('group')
    return text, test_df
The process_text function present in process file reads in the text from a particular section and cleans it (removes all the regex and html keys) using BeautifulSoup to convert it into a set of english sentences.
def process_text(text, section, test_df):
    """
    Extract one 10-K item from the raw filing and clean it into plain English text.

    Input :
        text (str) : The full filing text (HTML/XBRL markup included)
        section (str) : The label of the section which is to be cleaned (e.g. '1a')
        test_df (pd.DataFrame()) : The dataframe consisting of the start and end indices of all the items, indexed by 'item<label>'
    Output :
        clean_text (str) : The cleaned up and processed text
    """
    target = 'item' + section                         # index label to look up
    pos = test_df.index.tolist().index(target)        # positional row of this item
    # The item's text runs from its own start offset up to the start of the
    # next item in the dataframe
    seg_start = test_df.iloc[pos]['start_idx']
    seg_end = test_df.iloc[pos + 1]['start_idx']
    raw_segment = text[seg_start:seg_end]
    # Strip out all <table> elements — tables carry numbers, not sentences
    soup = BeautifulSoup(raw_segment, 'html.parser')
    for tbl in soup.find_all('table'):
        tbl.decompose()
    # Pull visible text, drop non-breaking spaces, collapse runs of whitespace
    visible = soup.get_text(separator=' ', strip=True).replace('\xa0', ' ')
    clean_text = ' '.join(visible.split())
    clean_text = re.sub(r'<span style=.*$', '', clean_text)
    return clean_text
The prediction function in predict predicts the sentiments present in the given cleaned up text using the input NLP model and returns the predictions as well as the probability of the prediction
def prediction(model, tokenizer, X):
    """
    Classify each sentence with a sequence-classification model.

    Input :
        model (AutoModelForSequenceClassification/BertModelForSequenceClassification) : The NLP model to be used for prediction
        tokenizer (AutoTokenizer/BertTokenizer) : The tokenizer to convert the sentence into corresponding tokens to feed into the model
        X (list[str]) : List of sentences on which the prediction needs to be done
    Output :
        preds (list[str]) : List of the predicted outputs for each sentence
        preds_proba (list[float]) : Prob. of prediction being correct
    """
    preds = []
    preds_proba = []
    # Shared tokenizer settings: pad/truncate each sentence to the 512-token limit
    tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}
    for sentence in X:
        with torch.no_grad():  # inference only — keep the weights frozen
            encoded = tokenizer(sentence, return_tensors="pt", **tokenizer_kwargs)
            logits = model(**encoded).logits
        # Softmax over the logits, keyed by the label names from the model config
        probabilities = scipy.special.softmax(logits.numpy().squeeze())
        scores = dict(zip(model.config.id2label.values(), probabilities))
        # Pick the highest-scoring label and its probability
        best_label = max(scores, key=scores.get)
        preds.append(best_label)
        preds_proba.append(scores[best_label])
    return preds, preds_proba
The create_df function in create finally combines all the above functions to convert the user input ticker and year to the final sentiment predictions. It outputs a dict containing the percentage of positive sentiment sentences, negative sentiment sentences and forward looking statements (specific as well as non-specific).
def create_df(ticker, year, tokenizer, tokenizer_fls, model, model_fls):
    """
    Download (if needed) and analyse a ticker's 10-K filing for one year.

    Input :
        ticker (str) : The user defined ticker
        year (int) : The user defined year
        tokenizer (AutoTokenizer) : The tokenizer to convert the sentence into corresponding tokens to feed into the model predicting sentiment
        tokenizer_fls (BertTokenizer) : The tokenizer to convert the sentence into corresponding tokens to feed into the model predicting forward looking statements
        model (AutoModelForSequenceClassification) : The NLP model to be used for predicting sentiment
        model_fls (BertForSequenceClassification) : The NLP model to be used for predicting forward looking sentences
    Output :
        dict : Contains percentage of positive sentiment sentences, negative sentiment
            sentences and forward looking statements (specific as well as non-specific).
            All values are NaN when the filing is unavailable or no confident
            predictions were produced.
    """
    dl = Downloader("MyCompanyName", "my.email@domain.com")  ## dummy identity required by SEC for web requests
    directory = 'sec-edgar-filings/' + ticker + '/10-K/'  ### Directory to check if report already exists
    file = checker(directory, year)  ## Check if report exists
    if file is None:
        ## Report not cached locally: try to fetch it for the requested year
        start_date = str(year) + '-01-01'
        end_date = str(year) + '-12-31'
        dl.get("10-K", ticker, after=start_date, before=end_date)
        file = checker(directory, year)  ### Check again if report is finally downloaded
    if file is None:
        ### Still missing => the report wasn't available on the website
        print("10K report for " + ticker + " doesn't exist for " + str(year))
        return {'negatives' : np.nan, 'positives' : np.nan, 'fls' : np.nan}  ## Return NaNs for plotting
    try:
        text = file.read()
    finally:
        file.close()  # checker() hands back an open handle — don't leak it
    text, section_df = get_section_location(text)  ## Get the df containing start and end positions of sections
    preds_df = pd.DataFrame()
    preds_fls_df = pd.DataFrame()
    ### Loop over relevant sections
    for section in ['1', '1a', '1b', '1c', '2', '3', '4', '7', '7a']:
        try:
            clean_text = process_text(text, section, section_df)  ### clean and process xml text of a particular section
            sentences = sent_tokenize(clean_text)  ### Convert the entire text into a list of sentences
            ### Only if the section has enough sentences perform analysis else skip to next section
            if len(sentences) > 5:
                preds = {}
                preds_fls = {}
                preds['preds'], preds['prob'] = prediction(model, tokenizer, sentences)  ## sentiment prediction
                preds_fls['preds'], preds_fls['prob'] = prediction(model_fls, tokenizer_fls, sentences)  ## forward-looking prediction
                preds_df = pd.concat([preds_df, pd.DataFrame(preds)])
                preds_fls_df = pd.concat([preds_fls_df, pd.DataFrame(preds_fls)])
            else:
                print('Section ' + section + ' : has too few sentences')
        ### `except Exception` (not bare `except:`) so Ctrl-C / SystemExit still propagate;
        ### an exception here means the section wasn't present in the report
        except Exception:
            print('Section ' + section + " doesn't exist in the report")
            continue
    ### Calculate percentages of positive, negative and forward looking sentences.
    ### Only count predictions whose confidence is above the 0.5 threshold.
    confident = preds_df[preds_df.prob > 0.5] if len(preds_df) else preds_df
    confident_fls = preds_fls_df[preds_fls_df.prob > 0.5] if len(preds_fls_df) else preds_fls_df
    ### Guard against division by zero when no section yielded confident predictions
    if len(confident) == 0 or len(confident_fls) == 0:
        return {'negatives' : np.nan, 'positives' : np.nan, 'fls' : np.nan}
    pct_positives = len(confident[confident.preds == 'positive']) / len(confident)
    pct_negatives = len(confident[confident.preds == 'negative']) / len(confident)
    pct_fls = len(confident_fls[confident_fls.preds != 'Not FLS']) / len(confident_fls)
    return {'negatives' : pct_negatives, 'positives' : pct_positives, 'fls' : pct_fls}
The make_plots function in plot helps to make a subplot in a row, of the dataframe
def make_plots(fig, df, col, subplot_idx, title_text, legend_x, legend_y):
    """
    Populate one subplot of `fig` with a grouped bar chart of `df`.

    Input :
        fig (plotly.subplot) : The figure which needs to be populated with data
        df (pd.DataFrame) : The data to be plotted (one bar group per row, one bar per column)
        col (int): The position of the plot (row)  # NOTE(review): not used by the body; kept for call-site compatibility
        subplot_idx (int): The position of the plot in the row
        title_text (str) : The title of the subplot
        legend_x (float) : The relative x-coordinate of the legend
        legend_y (float) : The relative y-coordinate of the legend
    Output :
        fig (plotly.subplot) : The figure after being populated with data
    """
    bar_width = 0.2                                  # width of one bar in a group
    group_width = len(df.columns) * bar_width        # total width of a group of bars
    positions = list(range(len(df)))                 # one base position per df row
    for offset, column in enumerate(df.columns):
        labels = [f"{val:.2f}" for val in df[column]]
        # One trace per column, shifted right by `offset` bars within the group
        fig.add_trace(
            go.Bar(
                x=[p + offset * bar_width for p in positions],
                y=df[column],
                name=column,
                text=labels,
                textposition='outside',
                hoverinfo='text',
                hovertext=labels,
                width=bar_width,
                legendgroup=f'group{subplot_idx}', # Assign legend group
                showlegend=True # Show legend for each trace
            ),
            row=1, col=subplot_idx
        )
    # Centre the tick labels under each bar group
    fig.update_xaxes(
        tickvals=[p + group_width / 2 - bar_width / 2 for p in positions],
        ticktext=df.index,
        row=1, col=subplot_idx
    )
    # Title, grouped-bar mode and horizontal legend placement
    fig.update_layout(
        title_text=title_text,
        barmode='group',
        showlegend=True,
        legend=dict(x=legend_x, y=legend_y, orientation="h")
    )
    return fig
##### Inputs #####
# NOTE(review): this cell re-declares the inputs, tokenizers and models
# defined earlier — presumably a notebook-cell re-run; confirm intent.
ticker = 'MS'
year = 2014
### Load tokenizers for sentiment and FLS
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
tokenizer_fls = BertTokenizer.from_pretrained("yiyanghkust/finbert-fls")
### Load models for sentiment and FLS
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model_fls = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-fls")
### Load competitors for the ticker. all_stocks.csv contains all the Symbol data like ticker name and Market Cap
competitors = get_competitors('all_stocks.csv', 5, ticker)
/var/folders/p5/5wy4s80s1mb5bcs768v3lhv00000gn/T/ipykernel_6783/453538332.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy temp['Market Cap'] = temp['Market Cap'].astype(float).astype(int) /var/folders/p5/5wy4s80s1mb5bcs768v3lhv00000gn/T/ipykernel_6783/453538332.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy temp['cap_diff'] = np.abs(temp['Market Cap'] - int(float(x['Market Cap'].iloc[0])))
competitors
['MS', 'GS', 'SCHW', 'BLK', 'CME']
### initialize empty dicts, one for competitors data another for the stock's historical analysis data
competitors_dict = {}
historical_dict = {}
## Make predictions for each competitor (each call downloads/analyses one 10-K)
for stock in competitors:
    competitors_dict[stock] = create_df(stock, year, tokenizer, tokenizer_fls, model, model_fls)
    ## If the current stock is the user's ticker, reuse the result for the historical dict too
    if (stock == ticker):
        historical_dict[year] = competitors_dict[stock].copy()
## Make predictions for the past 4 years of the user's ticker (year itself was added above)
for curr_year in range(year-4,year):
    historical_dict[curr_year] = create_df(ticker, curr_year, tokenizer, tokenizer_fls, model, model_fls)
Section 1b : has too few sentences Section 1c doesn't exist in the report Section 4 : has too few sentences Section 7a doesn't exist in the report Section 1b : has too few sentences Section 1c doesn't exist in the report Section 7 : has too few sentences Section 7a doesn't exist in the report Section 1b : has too few sentences Section 1c doesn't exist in the report Section 3 : has too few sentences Section 4 : has too few sentences Section 7a doesn't exist in the report Section 1b : has too few sentences Section 1c doesn't exist in the report Section 2 : has too few sentences Section 4 : has too few sentences Section 7a doesn't exist in the report Section 1b : has too few sentences Section 1c doesn't exist in the report Section 3 : has too few sentences Section 4 : has too few sentences Section 7a doesn't exist in the report Section 1b : has too few sentences Section 1c doesn't exist in the report Section 4 : has too few sentences Section 7a doesn't exist in the report Section 1b : has too few sentences Section 1c doesn't exist in the report Section 4 : has too few sentences Section 7a doesn't exist in the report Section 1b : has too few sentences Section 1c doesn't exist in the report Section 4 : has too few sentences Section 7a doesn't exist in the report Section 1b : has too few sentences Section 1c doesn't exist in the report Section 4 : has too few sentences Section 7a doesn't exist in the report
## Convert the dicts to dataframes, both raw and normalised
## (z-score per column so metrics on different scales are comparable)
historical_df = pd.DataFrame(historical_dict).transpose().sort_index()
historical_df_norm = (historical_df - historical_df.mean())/historical_df.std()
competitors_df = pd.DataFrame(competitors_dict).transpose()
competitors_df_norm = (competitors_df - competitors_df.mean())/competitors_df.std()
## The dataframe containing historical sentiment analysis of the ticker
historical_df
| negatives | positives | fls | |
|---|---|---|---|
| 2010 | 0.145918 | 0.051020 | 0.176292 |
| 2011 | 0.149959 | 0.044010 | 0.210697 |
| 2012 | 0.172161 | 0.044689 | 0.180566 |
| 2013 | 0.175113 | 0.091883 | 0.138281 |
| 2014 | 0.159450 | 0.098341 | 0.150800 |
## The dataframe containing comparative sentiment analysis of the ticker w.r.t. its closest 5 competitors
competitors_df
| negatives | positives | fls | |
|---|---|---|---|
| MS | 0.159450 | 0.098341 | 0.150800 |
| GS | 0.222500 | 0.062500 | 0.317559 |
| SCHW | 0.162162 | 0.143784 | 0.181720 |
| BLK | 0.239414 | 0.084691 | 0.334951 |
| CME | 0.200608 | 0.135258 | 0.339879 |
We plot the data next and make our observations. We see that the relative performance of CME is the best among all the competitors whereas Morgan Stanley (our ticker) has performed the worst on average. We rate a stock that has a relatively high positive sentiment, low negative sentiment and high FLS score, to have the best performance
from plotly.subplots import make_subplots
import plotly.graph_objects as go
#### Create 2 figures each having 2 subplots, for normalised and unnormalised versions
fig1 = make_subplots(rows=1, cols=2, subplot_titles=("Historical Sentiment Analysis", "Historical Sentiment Analysis Normalised"))
fig2 = make_subplots(rows=1, cols=2, subplot_titles=("Competitor Sentiment Analysis", "Competitor Sentiment Analysis Normalised"))
# Populate the subplots; legend positions are tuned per subplot
make_plots(fig1, historical_df, 1, 1, 'Historical Data Analysis', legend_x=-2, legend_y=-0.1)
make_plots(fig1, historical_df_norm, 1, 2, 'Historical Data Analysis Normalised', legend_x=0.6, legend_y=-0.1)
make_plots(fig2, competitors_df, 1, 1, 'Competitor Data Analysis', legend_x=0.1, legend_y=-0.1)
make_plots(fig2, competitors_df_norm, 1, 2, 'Competitor Data Analysis Normalised', legend_x=0.6, legend_y=-0.1)
fig1.show()
fig2.show()
import yfinance as yf
comps = get_competitors('all_stocks.csv', 5, ticker)
stock_prices = pd.DataFrame()
stock_rets = pd.DataFrame()
### For all the competitors, load the NEXT year's close prices to validate our observations
for comp in comps:
    tickerData = yf.Ticker(comp)
    # Get the historical prices for this ticker — one full calendar year following the filings
    tickerDf = tickerData.history(start = str(year+1) + '-01-01', end = str(year+1) + '-12-31')
    # Keep only the Close prices
    close_prices = tickerDf['Close']
    ## Cumulative return relative to the first trading day, for a better relative comparison
    close_prices_rets = close_prices/close_prices.iloc[0] - 1
    stock_prices = pd.concat([stock_prices, close_prices.rename(comp)], axis = 1)
    stock_rets = pd.concat([stock_rets, close_prices_rets.rename(comp)], axis = 1)
/var/folders/p5/5wy4s80s1mb5bcs768v3lhv00000gn/T/ipykernel_6783/453538332.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/p5/5wy4s80s1mb5bcs768v3lhv00000gn/T/ipykernel_6783/453538332.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import plotly.express as px
The below plot confirms our hypothesis. We can see that CME (along with SCHW) have the best returns for the year following the 10k reports that we analysed. Whereas MS (our ticker) has the worst performance. This follows our observation!
## Plot the cumulative returns of the competitors over the year following the filings
px.line(stock_rets)
/opt/homebrew/lib/python3.10/site-packages/plotly/express/_core.py:1076: FutureWarning: Dtype inference on a pandas object (Series, Index, ExtensionArray) is deprecated. The Series constructor will keep the original dtype in the future. Call `infer_objects` on the result to get the old behavior.
As part of the second task I have created an online application using Streamlit which generates a dashboard, based on the user-input ticker and year, consisting of the same plots as discussed above. Below is the code for creating the dashboard as present in app.py; the deployed application is available at the dashboard URL.
import streamlit as st
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from utils import get_competitors
from create import create_df
from plot import make_plots
### Get a list of all the possible tickers for the dropdown menu
tickers = pd.read_csv('all_stocks.csv').Symbol.unique().tolist()
### Get a list of past 30 years
years = [x for x in range(1995,2024)]
# Title of the application
st.title('Stock Data Visualization')
# Dropdown to select the ticker
ticker = st.selectbox('Select Ticker', tickers)
# Dropdown to select the year
year = st.selectbox('Select Year', years)
## tokenizers
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
tokenizer_fls = BertTokenizer.from_pretrained("yiyanghkust/finbert-fls")
## models
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model_fls = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-fls")
## competitors of the stock based on industry and market cap
competitors = get_competitors('all_stocks.csv', 5, ticker)
competitors_dict = {}
historical_dict = {}
#### following steps are the same as described above for generating the plots #####
#############
for stock in competitors:
    competitors_dict[stock] = create_df(stock, year, tokenizer, tokenizer_fls, model, model_fls)
    if (stock == ticker):
        historical_dict[year] = competitors_dict[stock].copy()
for curr_year in range(year-4,year):
    historical_dict[curr_year] = create_df(ticker, curr_year, tokenizer, tokenizer_fls, model, model_fls)
## BUGFIX: the dicts were previously plotted without first being converted to
## DataFrames, so historical_df / competitors_df were undefined (NameError at
## runtime). Build the raw and normalised frames exactly as in the notebook.
historical_df = pd.DataFrame(historical_dict).transpose().sort_index()
historical_df_norm = (historical_df - historical_df.mean())/historical_df.std()
competitors_df = pd.DataFrame(competitors_dict).transpose()
competitors_df_norm = (competitors_df - competitors_df.mean())/competitors_df.std()
fig1 = make_subplots(rows=1, cols=2, subplot_titles=("Historical Sentiment Analysis", "Historical Sentiment Analysis Normalised"))
fig2 = make_subplots(rows=1, cols=2, subplot_titles=("Competitor Sentiment Analysis", "Competitor Sentiment Analysis Normalised"))
# Define legend positions for each subplot
make_plots(fig1, historical_df, 1, 1, 'Historical Data Analysis', legend_x=-2, legend_y=-0.1)
make_plots(fig1, historical_df_norm, 1, 2, 'Historical Data Analysis Normalised', legend_x=0.6, legend_y=-0.1)
make_plots(fig2, competitors_df, 1, 1, 'Competitor Data Analysis', legend_x=0.1, legend_y=-0.1)
make_plots(fig2, competitors_df_norm, 1, 2, 'Competitor Data Analysis Normalised', legend_x=0.6, legend_y=-0.1)
#############
## Plot the figures on the streamlit application
st.plotly_chart(fig1)
st.plotly_chart(fig2)